This example demonstrates how to use deep learning models within alpha-mind.
Please set the environment variable DB_URI to point to the database.
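For example, the variable can be set from Python before the engine is created; the connection string below is purely illustrative and should be replaced with your own database URI:
import os

# Illustrative only: point DB_URI at your own alpha-mind database.
os.environ['DB_URI'] = 'postgresql+psycopg2://user:password@localhost/alpha'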
In [1]:
%matplotlib inline
import os
import datetime as dt
import numpy as np
import pandas as pd
from alphamind.api import *
from PyFin.api import *
import tensorflow as tf
from alphamind.model.modelbase import create_model_base
In [2]:
class LinearRegressionImpl(object):

    def __init__(self, **kwargs):
        self.learning_rate = kwargs.get('learning_rate', 0.1)
        self.training_epochs = kwargs.get('training_epochs', 100)
        self.display_steps = kwargs.get('display_steps', None)
        self.W = None
        self.b = None

    def result(self):
        # W and b are stored as numpy arrays after fit, so no session is needed here.
        return [self.W, self.b]

    def fit(self, x, y):
        num_samples, num_features = x.shape

        X = tf.placeholder("float", shape=[None, num_features])
        Y = tf.placeholder("float", shape=[None, 1])

        W = tf.Variable(np.random.randn(num_features, 1).astype(np.float32), name="weight")
        b = tf.Variable(np.random.randn(), name="bias")

        pred = tf.add(tf.matmul(X, W), b)

        # Mean squared error
        cost = tf.reduce_sum(tf.pow(pred - Y, 2)) / (2 * num_samples)

        # Gradient descent
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(cost)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            for epoch in range(self.training_epochs):
                for (train_x, train_y) in zip(x, y):
                    sess.run(optimizer, feed_dict={X: train_x.reshape(1, -1), Y: train_y.reshape(1, -1)})

                if self.display_steps is not None and (epoch + 1) % self.display_steps == 0:
                    c = sess.run(cost, feed_dict={X: x, Y: y.reshape(-1, 1)})
                    print("Epoch: {0}, cost = {1}, W = {2}, b = {3}".format(epoch + 1, c, sess.run(W), sess.run(b)))

            print('Optimization finished ......')
            training_cost = sess.run(cost, feed_dict={X: x, Y: y.reshape(-1, 1)})
            self.W = sess.run(W)
            self.b = sess.run(b)
            print('Training cost = {0}, W = {1}, b = {2}'.format(training_cost, self.W, self.b))

    def predict(self, x):
        num_features = x.shape[1]
        X = tf.placeholder("float", shape=[None, num_features])
        with tf.Session() as sess:
            # self.W and self.b are numpy arrays and are folded into the graph as constants.
            ret = sess.run(tf.add(tf.matmul(X, self.W), self.b), feed_dict={X: x})
        return np.squeeze(ret)
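As a quick sanity check, the implementation can be exercised on its own before wiring it into alpha-mind. The snippet below is a minimal sketch with synthetic data (all names and numbers are made up for illustration); the fitted weights should end up close to the generating coefficients:
# Synthetic regression problem: y = x @ [1, 2, 3, 4] + 0.5 + small noise.
rng = np.random.RandomState(0)
x_demo = rng.randn(200, 4).astype(np.float32)
y_demo = x_demo @ np.array([1., 2., 3., 4.], dtype=np.float32) + 0.5 + 0.01 * rng.randn(200).astype(np.float32)

demo_model = LinearRegressionImpl(learning_rate=0.1, training_epochs=200)
demo_model.fit(x_demo, y_demo)
print(demo_model.result())            # fitted [W, b], roughly [1, 2, 3, 4] and 0.5
print(demo_model.predict(x_demo[:5])) # predictions for the first five samples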
To plug into the alpha-mind framework, we also need to define the following wrapper. The wrapper needs to implement two methods: load and save.
In [3]:
class LinearRegressionTF(create_model_base()):

    def __init__(self, features, fit_target, **kwargs):
        super().__init__(features=features, fit_target=fit_target)
        self.impl = LinearRegressionImpl(**kwargs)

    @classmethod
    def load(cls, model_desc: dict):
        return super().load(model_desc)

    def save(self):
        model_desc = super().save()
        model_desc['weight'] = self.impl.result()
        return model_desc
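The sketch below shows how the save/load pair is meant to be used. It is hypothetical and not part of the notebook's actual workflow: the variable names are illustrative, and the exact contents of model_desc depend on the base class returned by create_model_base.
# Hypothetical round trip through the model description dict.
demo_tf_model = LinearRegressionTF(features={'ROE': LAST('ROE')}, fit_target=['ROE'], training_epochs=10)
model_desc = demo_tf_model.save()               # 'weight' holds [W, b] (None before fitting)
restored = LinearRegressionTF.load(model_desc)  # rebuild an equivalent model from the description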
In [4]:
freq = '60b'
universe = Universe('zz800')
batch = 1
neutralized_risk = industry_styles
risk_model = 'short'
pre_process = [winsorize_normal, standardize]
post_process = [standardize]
warm_start = 3
data_source = os.environ['DB_URI']
horizon = map_freq(freq)
engine = SqlEngine(data_source)
We use the current roe_q factor to try to predict the roe_q factor roughly one month into the future, over the zz800 universe.
In [5]:
kernal_feature = 'ROE'
regress_features = {kernal_feature: LAST(kernal_feature),
kernal_feature + '_l1': SHIFT(kernal_feature, 1),
kernal_feature + '_l2': SHIFT(kernal_feature, 2),
kernal_feature + '_l3': SHIFT(kernal_feature, 3)
}
fit_target = [kernal_feature]
data_meta = DataMeta(freq=freq,
universe=universe,
batch=batch,
neutralized_risk=neutralized_risk,
risk_model=risk_model,
pre_process=pre_process,
post_process=post_process,
warm_start=warm_start,
data_source=data_source)
regression_model_tf = LinearRegressionTF(features=regress_features, fit_target=fit_target, training_epochs=400, learning_rate=0.01)
regression_composer_tf = Composer(alpha_model=regression_model_tf, data_meta=data_meta)
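To make the feature construction concrete, here is a toy illustration of what the four regression features look like for a single stock. The numbers are made up, and it assumes SHIFT(f, n) behaves like an n-period lag of the factor (mirrored below with pandas shift):
roe_history = pd.Series([0.10, 0.12, 0.11, 0.15, 0.14])   # toy ROE history for one stock

toy_features = pd.DataFrame({
    'ROE':    roe_history,            # LAST('ROE')     -> current value
    'ROE_l1': roe_history.shift(1),   # SHIFT('ROE', 1) -> value one period back
    'ROE_l2': roe_history.shift(2),   # SHIFT('ROE', 2) -> value two periods back
    'ROE_l3': roe_history.shift(3),   # SHIFT('ROE', 3) -> value three periods back
})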
In [10]:
ref_date = '2011-01-01'
ref_date = adjustDateByCalendar('china.sse', ref_date).strftime('%Y-%m-%d')
In [11]:
regression_model_sk = LinearRegression(features=regress_features, fit_target=fit_target)
regression_composer_sk = Composer(alpha_model=regression_model_sk, data_meta=data_meta)
In [ ]:
%%time
regression_composer_sk.train(ref_date)
regression_composer_tf.train(ref_date)
print("\nSklearn Regression Testing IC: {0:.4f}".format(regression_composer_sk.ic(ref_date=ref_date)[0]))
print("Tensorflow Regression Testing IC: {0:.4f}".format(regression_composer_tf.ic(ref_date=ref_date)[0]))
In [23]:
start_date = '2011-01-01'
end_date = '2012-01-01'
data_package2 = fetch_data_package(engine,
alpha_factors=regress_features,
start_date=start_date,
end_date=end_date,
frequency=freq,
universe=universe,
benchmark=906,
warm_start=warm_start,
batch=1,
neutralized_risk=neutralized_risk,
pre_process=pre_process,
post_process=post_process)
model_dates = [d.strftime('%Y-%m-%d') for d in list(data_package2['predict']['x'].keys())]
industry_name = 'sw_adj'
industry_level = 1
industry_names = industry_list(industry_name, industry_level)
industry_total = engine.fetch_industry_matrix_range(universe, dates=model_dates, category=industry_name, level=industry_level)
In [24]:
rets1 = []
rets2 = []

for i, ref_date in enumerate(model_dates):
    py_ref_date = dt.datetime.strptime(ref_date, '%Y-%m-%d')
    industry_matrix = industry_total[industry_total.trade_date == ref_date]

    dx_returns = pd.DataFrame({'dx': data_package2['predict']['y'][py_ref_date].flatten(),
                               'code': data_package2['predict']['code'][py_ref_date].flatten()})

    res = pd.merge(dx_returns, industry_matrix, on=['code']).dropna()
    codes = res.code.values.tolist()
    alpha_logger.info('{0} full re-balance: {1}'.format(ref_date, len(codes)))

    ## sklearn regression model
    raw_predict1 = regression_composer_sk.predict(ref_date, x=data_package2['predict']['x'][py_ref_date])[0].loc[codes]
    er1 = raw_predict1.fillna(raw_predict1.median()).values

    target_pos1, _ = er_portfolio_analysis(er1,
                                           res.industry_name.values,
                                           None,
                                           None,
                                           False,
                                           None,
                                           method='ls')
    target_pos1['code'] = codes

    result1 = pd.merge(target_pos1, dx_returns, on=['code'])
    ret1 = result1.weight.values @ (np.exp(result1.dx.values) - 1.)
    rets1.append(np.log(1. + ret1))

    ## tensorflow regression model
    raw_predict2 = regression_composer_tf.predict(ref_date, x=data_package2['predict']['x'][py_ref_date])[0].loc[codes]
    er2 = raw_predict2.fillna(raw_predict2.median()).values

    target_pos2, _ = er_portfolio_analysis(er2,
                                           res.industry_name.values,
                                           None,
                                           None,
                                           False,
                                           None,
                                           method='ls')
    target_pos2['code'] = codes

    result2 = pd.merge(target_pos2, dx_returns, on=['code'])
    ret2 = result2.weight.values @ (np.exp(result2.dx.values) - 1.)
    rets2.append(np.log(1. + ret2))

    alpha_logger.info('{0} is finished'.format(ref_date))
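The return accounting in the loop above treats dx as a log return over the holding period: exp(dx) - 1 converts it to a simple return, the dot product with the portfolio weights gives the portfolio's simple return, and log(1 + ret) converts that back to a log return so the series can later be cumulated by summation. A tiny worked example with made-up numbers:
demo_weights = np.array([0.5, -0.5])   # long/short weights, as produced by er_portfolio_analysis
demo_dx = np.array([0.02, -0.01])      # per-stock log returns over the period (made up)
demo_simple = np.exp(demo_dx) - 1.     # log returns -> simple returns
demo_ret = demo_weights @ demo_simple  # portfolio simple return for the period
demo_log_ret = np.log(1. + demo_ret)   # back to a log return, additive across periods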
In [25]:
ret_df = pd.DataFrame({'sklearn': rets1, 'tensorflow': rets2}, index=model_dates)
ret_df.loc[advanceDateByCalendar('china.sse', model_dates[-1], freq).strftime('%Y-%m-%d')] = 0.
ret_df = ret_df.shift(1)
ret_df.iloc[0] = 0.
ret_df[['sklearn', 'tensorflow']].cumsum().plot(figsize=(12, 6),
title='Fixed freq rebalanced: {0}'.format(freq))
Out[25]: [figure: cumulative log returns of the sklearn and tensorflow models, 'Fixed freq rebalanced: 60b']